# -*- coding: utf-8 -*-
import re
import scrapy
from copy import copy, deepcopy
from scrapy import Request
from scinet.items import ScinetItem
from urllib.parse import quote
from lxml import etree


class ScienceSpider(scrapy.Spider):
    """Crawl blogger profiles on blog.sciencenet.cn.

    Walks the three-level discipline hierarchy (realm -> realm_medium ->
    realm_type), then the blogger list of each third-tier discipline, and
    finally each blogger's home page, extracting affiliation, title and
    blog statistics (visitors, article count, activity).
    """

    name = 'science'
    allowed_domains = ['blog.sciencenet.cn']
    host_url = "http://blog.sciencenet.cn/"
    base_url = "http://blog.sciencenet.cn/blog.php?mod=member&type="
    blog_base_url = "http://blog.sciencenet.cn/home.php?mod=space&uid="
    blogger_base_url = "http://blog.sciencenet.cn/blog.php?mod=member&type={}&realmmedium={}&realm={}&catid={}"

    # First-tier discipline names; the site expects them GBK percent-encoded.
    realm_list = ["生命科学", "医学科学", "化学科学", "工程材料", "信息科学", "地球科学", "数理科学", "管理综合"]

    def start_requests(self):
        """Yield one start request per first-tier discipline."""
        for realm in self.realm_list:
            url = self.base_url + quote(realm, encoding="gbk")
            request = Request(url=url, callback=self.parse)
            request.headers["Referer"] = "http://blog.sciencenet.cn/blog.php"
            yield request

    def parse(self, response):
        """Parse a first-tier discipline page.

        Iterates every second- and third-tier discipline and schedules one
        request per third-tier blogger-list page, carrying the partially
        filled item along in ``meta``.
        """
        item = ScinetItem()
        # First-tier discipline name shown in the page header.
        item["realm"] = response.xpath("//div[@id='con_head']/text()").extract_first()
        # Each 'box_line' div holds one second-tier discipline.
        for medium_sel in response.xpath("//div[@class='box_line']"):
            item["realm_medium"] = medium_sel.xpath("./div[@class='box_l']/text()").extract_first()
            # Third-tier disciplines are the <li> links on the right side.
            for type_sel in medium_sel.xpath("./div[@class='box_r']/ul/li"):
                item["realm_type"] = type_sel.xpath("./a/@title").extract_first()
                item["catid"] = re.match(
                    r".*?&catid=(\d+)", type_sel.xpath("./a/@href").extract_first(),
                    re.S).group(1)
                # Build the blogger-list URL; names must be GBK-quoted.
                realm = quote(item["realm"], encoding="gbk")
                realm_medium = quote(item["realm_medium"], encoding="gbk")
                realm_type = quote(item["realm_type"], encoding="gbk")
                blogger_list_url = self.blogger_base_url.format(
                    realm_type, realm_medium, realm, item["catid"])
                # Deep-copy the item: the same object is mutated on every loop
                # iteration, so passing it by reference would make all pending
                # requests see only the last values written.
                request = Request(url=blogger_list_url,
                                  callback=self.parse_blogger_list_page,
                                  meta={"item": deepcopy(item)})
                request.headers["Referer"] = self.base_url + realm
                yield request

    def parse_blogger_list_page(self, response):
        """Parse a third-tier blogger list and schedule one request per blogger.

        When the discipline has no bloggers at all, yields a placeholder item
        with ``name``/``uid`` set to ``"Null"``.
        """
        item = response.meta["item"]
        blogger_list = response.xpath("//div[@class='potbox']//p[@class='potfont']")
        if blogger_list.extract():
            for blogger in blogger_list:
                item["name"] = blogger.xpath("./a/text()").extract_first()
                # Blog uid, taken from the profile link's query string.
                item["uid"] = re.match(
                    r".*?&uid=(\d+)", blogger.xpath("./a/@href").extract_first(),
                    re.S).group(1)
                blog_url = self.blog_base_url + item["uid"]
                # Deep-copy again for the same aliasing reason as in parse().
                yield Request(url=blog_url, callback=self.parse_blog_page,
                              meta={"item": deepcopy(item)})
        else:
            # No bloggers registered under this third-tier discipline.
            item["name"] = "Null"
            item["uid"] = "Null"
            yield item

    def parse_blog_page(self, response):
        """Parse a blogger home page and yield the completed item.

        Three cases:
        * public blog  -> parse the profile rows and the sidebar counters;
        * restricted   -> scrape the counters off the access-denied notice;
        * banned/gone  -> yield the incomplete item so the pipeline drops it.
        """
        item = response.meta["item"]
        # Profile rows are only present when the blog is publicly visible.
        blogger_info_list = response.xpath("//li[@class='ul_diy']").extract()

        if blogger_info_list:
            for blogger_info in blogger_info_list:
                if "工作情况" in blogger_info:      # employment section
                    self._parse_work_info(item, blogger_info)
                elif "教育情况" in blogger_info:    # education section
                    self._parse_education_info(item, blogger_info)

            # Total blog visits ('--' means no counter yet).
            visitors = response.xpath("//div[@id='pcd']/p/strong/text()").extract_first()
            item["visitors"] = int(visitors) if visitors and visitors != "--" else 0
            # Article count and activity share the same [count, label] layout.
            item["articles"] = self._stat_from_pair(
                response.xpath("//div[@id='pcd']/ul/li[7]/a/text()").extract(), "学术名片")
            item["activity"] = self._stat_from_pair(
                response.xpath("//div[@id='pcd']/ul/li[4]/a/text()").extract(), "给我留言")
        else:
            # Decode once; the original decoded the body on every regex call.
            body = response.body.decode("gbk")
            visitors_found = re.findall(r"<li>博客访问量: (.*?)</li>", body, re.S)
            if visitors_found:
                # Access-restricted blog: stats are shown on the notice page.
                visitors = visitors_found[0].strip()
                articles = re.findall(r"<li>帖子数: (.*?)</li>", body, re.S)[0].strip()
                activity = re.findall(r"<li>活跃度: (.*?) ℃</li>", body, re.S)[0].strip()
                item["visitors"] = int(visitors) if visitors.isdigit() else 0
                item["articles"] = int(articles) if articles.isdigit() else 0
                item["activity"] = int(activity) if activity.isdigit() else 0
            else:
                # Blog banned or otherwise unavailable: yield the incomplete
                # item so the pipeline can drop it, and STOP here. The
                # original fell through to the final yield below and emitted
                # the same item twice.
                yield item
                return

        # Fall back to '其他' when title/institute were never filled.
        if not item.get("title"):
            item["title"] = "其他"
        if not item.get("institute"):
            item["institute"] = "其他"
        yield item

    @staticmethod
    def _stat_from_pair(texts, label):
        """Return the numeric counter from a two-element [count, label] list.

        Returns 0 when the list does not have exactly two entries or the
        counter is shown as '--'.
        """
        if len(texts) == 2:
            texts.pop(texts.index(label))
            if texts[0] != "--":
                return int(texts[0])
        return 0

    @staticmethod
    def _parse_work_info(item, blogger_info):
        """Fill item['institute'] / item['title'] from a '工作情况' row."""
        html = etree.HTML("<ul>" + blogger_info + "</ul>")
        info = html.xpath("//li/text()")
        if not info:
            return
        text = info[0]
        info_list = text.split("，")
        item["institute"] = info_list[0]
        if len(info_list) == 3:
            item["title"] = info_list[2]
            return
        # Substring tests must run specific -> general: '副研究员' and
        # '助理研究员' both contain '研究员', and '副教授' contains '教授'.
        # (The original tested '研究员' before '助理研究员' — making the
        # latter unreachable — and had the two assignments swapped.)
        for keyword in ("副研究员", "副教授", "助理研究员", "教授", "研究员", "讲师"):
            if keyword in text:
                item["title"] = keyword
                return
        # '薄厚' is a common user misspelling of '博后' (postdoc).
        if "薄厚" in text or "博后" in text or "博士后" in text:
            item["title"] = "博士后"
        else:
            item["title"] = "其他"

    @staticmethod
    def _parse_education_info(item, blogger_info):
        """Fill item['institute'] / item['title'] from a '教育情况' row."""
        html = etree.HTML("<ul>" + blogger_info + "</ul>")
        info = html.xpath("//li/text()")
        if not info:
            return
        text = info[0]
        item["institute"] = text.split("，")[0]
        # '博士在读' contains '博士' (etc.), so a single substring test per
        # degree suffices.
        if "博士" in text:
            item["title"] = "博士"
        elif "硕士" in text:
            item["title"] = "硕士"
        elif "本科" in text:
            item["title"] = "本科"
        else:
            item["title"] = "其他"